require(adespatial)
require(rubias)
require(hierfstat)
require(scico)
require(ade4)
require(marmap)
require(vegan)
require(ggmap)
require(pegas)
require(adegenet)
require(tidyverse)
require(knitr)
require(magrittr)

Readme

This notebook is part of an rstudio project. If you’d like to pre-rendered figures, read a summary of analysis and view code, please open the relevant html file in a browser.

The full project with data (except raw sequencing data) and results is archived as a github repository at [https://github.com/david-dayan/chum_coastal_pilot]

Rationale and Outline

Here we conduct the analysis for the Oregon Coastal Chum Salmon Genetics Pilot Study (2019-2021) Oregon Department of Fish and Wildlife Conservation and Recovery Program using the Oke350 GTseq panel.

This notebook contains analysis logs and results from the four objectives of this project:
(1) Collect and analyze samples from the three largest coastal chum salmon populations (Nehalem, Tillamook, and Yaquina), and two additional basins (Siletz and Netarts) that often have a substantial number of spawners, to investigate genetic structure of coastal chum salmon populations.
(2) Collect and analyze samples from two major Tillamook sub-basins (Kilchis and Miami) to investigate whether there is significant genetic structure within the basin.
(3) Collect tissue samples from different anatomical locations of chum carcasses to investigate whether certain tissues are more likely to provide higher quality samples for analysis.
(4) Analyze a small number of archival chum scale samples to evaluate the potential for investigating historical population structure, using the large number of chum scale samples ODFW has collected through spawning grounds surveys over time.

We also answer a lingering question from a previous sequencing run using these new data: why did so many samples fail in the panel optimization run?

Data Summary

Sequencing Data

Run 1:
The first run was the test plate for the Oke350 panel. Raw compressed reads are at /dfs/Omalley_Lab/dayan/coastal_chinook_2020/full_panel/demux/keta_reads .

95 samples library from four populations spiked into a larger coastal chinook run. Demuxed with deML. The total number of reads was 50,680,864 .

Run 2:
Raw sequencing data is at /dfs/Omalley_Lab/dayan/chum_pilot_2021 and /dfs/Omalley_Lab/fitz/Runs/4774 . It was demultiplexed by the sequencing center

Raw data is from the SFGL Illumina 020 run.

285 samples total (including controls, replicates and samples from run 1 that required resequencing)

Intermediate data/results are in the directory /dfs/Omalley_Lab/dayan/chum_pilot_2021

Samples

Before filtering (duplicates removed) there were XXX samples, sample size per basin and spawning tributary (if available) is below.

meta_data <- readxl::read_xlsx("metadata/Oke GT-seq runs.xlsx", sheet = 6)

#create field of archival scale samples
meta_data %<>%
  mutate(tissue_type = case_when(str_detect(sample, "OkeCC13") ~ "scale",
                                 TRUE ~ "fresh"))
#note here that some samples in the genotype data were missing from the metadata ("OkeCC19MILC_0001" "OkeCC19NEHR_0001" "OkeCC19TILR_0102"), the data was available from other sources, manually added them to the "combined" on the metadata xslx.

kable(filter(meta_data, tissue_type == "fresh") %>% count(basin, detailed_location))
basin detailed_location n
Coos Coos River 3
Nehalem Nehalem River 50
Netarts Whiskey Creek 26
Siletz Bear Creek 3
Tillamook Kilchis River 50
Tillamook Miami River 50
Yaquina Mill Creek 64
Yaquina Simpson Creek 10
map_data <- filter(meta_data, tissue_type == "fresh") %>%
  dplyr::select(basin, detailed_location, lon, lat)

bound_box <- make_bbox(lon = c(-123,-125), lat = map_data$lat, f = .2)
map <- get_map(location = bound_box, maptype = "terrain", source = "google", zoom = 8)
ggmap(map) + geom_point(data = map_data, aes(x = lon, y = lat), color = "red")+ geom_text(data = distinct(map_data, basin, .keep_all = TRUE), aes(label = basin ), hjust = 1.2)

ggmap(map) + geom_point(data = map_data, aes(x = lon, y = lat), color = "red")+ geom_text(data = distinct(arrange(map_data, desc(lat)), basin, .keep_all = TRUE), aes(label = basin ), hjust = 1.2)+xlab("")+ylab("")

Note here that the number of Mill Creek samples is 20 greater than reflected in the sample summary from raw genotype data because 10 archival scale samples were not included in the library and 10 are excluded here (only do structure analysis on current samples).

Genotype Data
Detailed log of genotyping (raw reads to filtered genotypes) is available in a R notebook

The final filtered dataset includes 235 individuals and 325 markers (including 8 scale samples). Basin and subbasin sample sizes after filtering are below

#note here we also unify the metadata and population names
load("genotype_data/genind_2.0.R")
load("genotype_data/genotypes_2.2.R")

genos_full <- genos_2.0
genind_full <- genind_2.0

genos_full %<>%
  left_join(meta_data)

genind_full@pop <- as.factor(genos_full$basin)


#remove archive samples
genos_2.0 %<>%
  left_join(meta_data) %>%
  filter(tissue_type == "fresh")

genind_2.0 <- genind_2.0[!(str_detect(rownames(genind_2.0$tab ), "OkeCC13")),]
genind_2.0@pop <- as.factor(genos_2.0$basin)

kable(genos_2.0 %>%
  group_by(basin, detailed_location) %>%
  summarise(n = n(), missing_data = mean((100-`%GT`)/100))
)
basin detailed_location n missing_data
Coos Coos River 3 0.0552333
Nehalem Nehalem River 49 0.0276490
Netarts Whiskey Creek 13 0.1281462
Siletz Bear Creek 3 0.0495333
Tillamook Kilchis River 45 0.0600022
Tillamook Miami River 46 0.0696913
Yaquina Mill Creek 59 0.0443593
Yaquina Simpson Creek 9 0.0066778
kable(genos_2.0 %>%
  group_by(basin) %>%
  summarise(n = n(), missing_data = mean((100-`%GT`)/100))
)
basin n missing_data
Coos 3 0.0552333
Nehalem 49 0.0276490
Netarts 13 0.1281462
Siletz 3 0.0495333
Tillamook 91 0.0649000
Yaquina 68 0.0393721

Population Genetic Structure

PCA

First let’s took a look at the data in PC space.

X <- scaleGen(genind_2.0, NA.method="mean")


#then run pca
pca1 <- dudi.pca(X, scale = FALSE, scannf = FALSE, nf = 227)

#check pcs to keep with kaiser-guttman

#kaiser guttman
cutoff<-mean(pca1$eig)
kg <- length((pca1$eig)[(pca1$eig)>cutoff])
barplot(pca1$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

#broken stick
n <- length(pca1$eig)
bsm <- data.frame(j=seq(1:n), p = 0)
bsm$p[1] <- 1/n
for (i in 2:n){
  bsm$p[i] <- bsm$p[i-1]+(1/(n+1-i))
  
}
bsm$p <- 100*bsm$p/n

pca_eigs_to_plot <- as.data.frame(cbind(100*pca1$eig/sum(pca1$eig)), rev(bsm$p))
pca_eigs_to_plot %<>%
  rownames_to_column(var = "bsm") %>%
  rename(pca_eig_perc = V1) %>%
  mutate(pca_eig_perc = as.numeric(pca_eig_perc))
  



#kept all PCs
snp_pcs <- pca1$li#[,c(1:kg)]

#now plot data
snp_pcs %<>%
  rownames_to_column("sample") %>%
  left_join(meta_data) %>%
  mutate(basin = fct_relevel(basin, "Nehalem", "Tillamook", "Netarts", "Siletz", "Yaquina", "Coos"))

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = basin)) + stat_ellipse(aes(Axis1, Axis2, color = basin)) +theme_classic()+scale_color_scico_d()

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis3, color = basin)) + stat_ellipse(aes(Axis1, Axis2, color = basin)) +theme_classic()+scale_color_scico_d()

#s.class(pca1$li, as.factor(snp_pcs$basin),xax=1,yax=2, col=transp(viridisLite::viridis(6),0.7), axesell=FALSE, cstar=0, cpoint=1, grid=FALSE)
#add.scatter.eig(pca1$eig[1:10],nf=3,xax=1,yax=2, posi = "bottomright" )

#3d plot as well
plotly::plot_ly(x=snp_pcs$Axis1, y=snp_pcs$Axis2, z=snp_pcs$Axis3, type="scatter3d", mode="markers", color=snp_pcs$basin, alpha = 0.8)
#and plot by approx distance
ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = lat)) +theme_classic()+scale_color_viridis_c()

ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis3, color = lat)) +theme_classic()+scale_color_viridis_c()

plotly::plot_ly(x=snp_pcs$Axis1, y=snp_pcs$Axis2, z=snp_pcs$Axis3, type="scatter3d", mode="markers", color=snp_pcs$lat, alpha = 0.8, colors = )
# publication plot (with bsm_+kaiser guttman criteria)
a <- ggplot(data = snp_pcs)+geom_point(aes(Axis1, Axis2, color = basin), alpha = 0.7, size =2) + stat_ellipse(aes(Axis1, Axis2, color = basin)) +theme_classic()+xlab("PC1\n2.1% of variance")+ylab("PC2\n1.7% of variance")+  theme(legend.text=element_text(size=12))+scale_color_manual(name = "Basin", values = c("#4477AA", "#66CCEE", "#228833", "#CCBB44", "#EE6677", "#AA3377" ))

pca_eigs_to_plot %<>%
  rowid_to_column("row_n") %>%
  mutate(bsm = as.numeric(bsm)) %>%
  pivot_longer(!row_n, names_to = "bsm_or_eig", values_to = "percent_variance")

ggplot(data = pca_eigs_to_plot[1:10,])+geom_bar(aes(x = as.factor(row_n), y = percent_variance, color = bsm_or_eig, fill = bsm_or_eig), stat = "identity", position=position_dodge())+theme_bw()

inset_scree <- pca_eigs_to_plot[1:16,] %>%
  filter( bsm_or_eig == "pca_eig_perc") %>%
  mutate(fillcolor = case_when(row_n < 3 ~ "a",
                               TRUE ~"b"))


b <- ggplot(data = inset_scree)+geom_bar(aes(x = as.factor(row_n), y = percent_variance,  fill = fillcolor), stat = "identity", position=position_dodge())+theme_classic()+theme(legend.position = "none", axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text = element_blank(), axis.ticks = element_blank())+scale_fill_manual(values = c( "grey15", "grey67"))

cowplot::ggdraw(a ) +
    cowplot::draw_plot(b, 0.11, 0.20, .25, .25)

PCA IBD There appears to be rough relationship between latitude and PC1 score, but this may be due to Yaquina vs all other structure (Coos is furthest south yet intermediate PC scores). We will formally examine IBD using mantel tests and spatial eigenalysis later, lets just quickly plot it

ggplot(data = snp_pcs)+geom_point(aes(lat, Axis1, color = basin))+geom_smooth(aes( lat, Axis1))+theme_bw()+scale_color_scico_d()

Sub-basin structure We should also examine for sub-basin structure. Let’s compare Miami to Kilchis Rivers.
First do they vary along the primary axis of variation found among all samples (PC1)?

ggplot(data = filter(snp_pcs, basin == "Tillamook"))+geom_point(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8)

#ggplot(data = filter(snp_pcs, basin == "Yaquina"))+geom_point(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8)

No, but, these populations may vary along a different axis (i.e. sub-basin structure may not reflect a smaller degree of separation along the same genetic axis). We could search for smaller PCs where the centroid varies across these two groups, or much easier for now, just conduct their own PCA.

Xtill <- scaleGen(seppop(genind_2.0)$Tillamook, NA.method="mean")
## Warning in .local(x, ...): Some scaling values are null.
##  Corresponding alleles are removed.
#then run pca
pca_till <- dudi.pca(Xtill, scale = FALSE, scannf = FALSE, nf = 320)

#check pcs to keep with kaiser-guttman

#kaiser guttman
cutoff<-mean(pca_till$eig)
kg <- length((pca_till$eig)[(pca_till$eig)>cutoff])
barplot(pca_till$eig, main = "PCA eigenvalues")
abline(h = cutoff, col = "red")

#kept all PCs
snp_pcs_till <- pca_till$li#[,c(1:kg)]

#now plot data
snp_pcs_till %<>%
  rownames_to_column("sample") %>%
  left_join(meta_data)
## Joining, by = "sample"
ggplot(data = snp_pcs_till)+geom_point(aes(Axis1, Axis2, color = detailed_location)) + stat_ellipse(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8, name = "River")

ggplot(data = snp_pcs_till)+geom_point(aes(Axis1, Axis3, color = detailed_location)) + stat_ellipse(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8)

# 
# Xyaq <- scaleGen(seppop(genind_2.0)$Yaquina, NA.method="mean")
# 
# 
# #then run pca
# pca_yaq <- dudi.pca(Xyaq, scale = FALSE, scannf = FALSE, nf = 320)
# 
# #check pcs to keep with kaiser-guttman
# 
# #kaiser guttman
# cutoff<-mean(pca_yaq$eig)
# kg <- length((pca_yaq$eig)[(pca_yaq$eig)>cutoff])
# barplot(pca_yaq$eig, main = "PCA eigenvalues")
# abline(h = cutoff, col = "red")
# 
# #kept all PCs
# snp_pcs_yaq <- pca_yaq$li#[,c(1:kg)]
# 
# #now plot data
# snp_pcs_yaq %<>%
#   rownames_to_column("sample") %>%
#   left_join(meta_data)
# 
# ggplot(data = snp_pcs_yaq)+geom_point(aes(Axis1, Axis2, color = detailed_location)) + stat_ellipse(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8)
# ggplot(data = snp_pcs_yaq)+geom_point(aes(Axis1, Axis3, color = detailed_location)) + stat_ellipse(aes(Axis1, Axis2, color = detailed_location)) +theme_classic()+scale_color_scico_d(begin = 0.2, end = 0.8)

Nope, very little difference in group centroids along the first two PCs in either Yaquina or Tillamook basins.

PCA Results Summary

The first 33 PCs are significant according to the Kaiser-Guttman criteria, however, an ad-hoc/by eye examination suggests only the first PC is revealing any real structure in the data. This first PC largely separates Yaquina from the rest of the samples. There is a relationship between latitude and PC1 scores, suggestive of IBD, but it may be driven by the large difference between the Yaquina and other samples, especially considering that Coos River samples demonstrate intermediate PC values despite being on the extreme south of the sampling area.

Diversity Metrics

Heterozygosity

Let’s examine patterns of heterozygosity and diversity next.

n.pop <- seppop(genind_2.0)

hobs <- lapply(n.pop, function(x) (summary(x)$Hobs))
hobs <- as.data.frame(t(do.call(rbind, hobs)))
hobs <- hobs %>%
  rownames_to_column(var="marker")
hobs <- hobs %>%
  pivot_longer(-marker, names_to = "basin", values_to = "Ho")
#ggplot(hobs)+geom_boxplot(aes(x=pop, y=hobs))+theme_classic()+xlab("Run Timing")+ylab("Observed Heterozygosity")

hexp <- lapply(n.pop, function(x) (summary(x)$Hexp))
hexp <- as.data.frame(t(do.call(rbind, hexp)))
hexp <- hexp %>%
  rownames_to_column(var="marker") %>%
  pivot_longer(-marker, names_to = "basin", values_to = "He")

marker_divs <- hexp %>%
  left_join(hobs) %>%
  pivot_longer(c("Ho", "He"), names_to = "obs_exp", values_to ="H") %>%
  mutate(basin = fct_relevel(basin, "Nehalem", "Tillamook", "Netarts", "Siletz", "Yaquina", "Coos"))

H_summary <- count(genos_2.0, basin)

#plot
ggplot(data=marker_divs)+geom_boxplot(aes(x=basin, y=H, fill = obs_exp))+theme_classic()+geom_text(data = H_summary,aes(basin, Inf, label = paste0("n = ", n), vjust = 1))+scale_fill_scico_d(begin = 0.2, end = 0.8, palette ="batlow", name = "")+xlab("Basin")

#table
marker_divs %>%
  group_by(basin, obs_exp) %>%
  summarise(mean = mean(H))
#spatial pattern?
#spatial_he <- marker_divs %>%
#   group_by(basin, obs_exp) %>%
#  filter(obs_exp == "He") %>%
#   summarise(mean = mean(H)) %>%
#  left_join(sites)
#summary(aov(data = spatial_he, mean~lat))

Significance Testing
Are these differences between populations significant? To test with the same number of loci (across populations) we’ll use a monte-carlo test and 999 permutations.

#
a <- Hs.test(n.pop$Coos, n.pop$Nehalem)
b <- Hs.test(n.pop$Coos, n.pop$Netarts)
c <- Hs.test(n.pop$Coos, n.pop$Siletz)
d <- Hs.test(n.pop$Coos, n.pop$Tillamook)
e <- Hs.test(n.pop$Coos, n.pop$Yaquina)

f <- Hs.test(n.pop$Nehalem, n.pop$Netarts)
g <- Hs.test(n.pop$Nehalem, n.pop$Siletz)
h <- Hs.test(n.pop$Nehalem, n.pop$Tillamook)
i <- Hs.test(n.pop$Nehalem, n.pop$Yaquina)

j <- Hs.test(n.pop$Netarts, n.pop$Siletz)
k <- Hs.test(n.pop$Netarts, n.pop$Tillamook)
l <- Hs.test(n.pop$Netarts, n.pop$Yaquina)

m <- Hs.test(n.pop$Siletz, n.pop$Tillamook)
n <- Hs.test(n.pop$Siletz, n.pop$Yaquina)

o <- Hs.test(n.pop$Tillamook, n.pop$Yaquina)


#now make a nice table
df <- as.data.frame(cbind(c("Coos", "Coos", "Coos", "Coos", "Coos", "Nehalem", "Nehalem","Nehalem","Nehalem", "Netarts", "Netarts", "Netarts", "Siletz", "Siletz", "Tillamook"), c("Nehalem", "Netarts", "Siletz", "Tillamook", "Yaquina", "Netarts", "Siletz", "Tillamook", "Yaquina", "Siletz", "Tillamook", "Yaquina",  "Tillamook", "Yaquina", "Yaquina"), c(a$pvalue,b$pvalue,c$pvalue,d$pvalue,e$pvalue,f$pvalue, g$pvalue, h$pvalue, i$pvalue, j$pvalue, k$pvalue, l$pvalue, m$pvalue, n$pvalue, o$pvalue)))
colnames(df) <- c("pop1", "pop2","p-value")
df$p.adj <- p.adjust(df$`p-value`,n =  nrow(df), method = "fdr")

kable(df)

rm(list = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "df"))

No basin pairs demonstrated significant differnce in expected heterozygosity after FDR correction.

Loci out of HWE

Are there significant departures from HWE at the loci level?

# here we use the hw.test function from pegas (exact test based on Monte Carlo permutations of alleles, 1000 permutations)
HWE.test <- lapply(n.pop, function(x) hw.test(x, B=1000))
# here we take the list of dataframes of p-values and combine into a single dataframe
hwe <- reduce(HWE.test, cbind)
hwe <- hwe[,c(4,8,12,16,20,24)]
colnames(hwe) <- c("Coos", "Nehalem", "Netarts", "Siletz", "Tillamook", "Yaquina")
hwe <- as.data.frame(hwe)

# next we correct for multiple comparisons
p.adj <- as.data.frame(apply(hwe, MARGIN = 2, function(x) p.adjust(x, "fdr")))
hwe_exceed <- p.adj %>% rownames_to_column(var="marker") %>%
  pivot_longer(-marker, names_to = "basin", values_to = "fdr") %>%
  filter(fdr < 0.1)

hwe_exceed <- hwe_exceed %>%
  left_join(pivot_wider(marker_divs, names_from = obs_exp, values_from = H)) %>%  
  mutate(direction = if_else(He > Ho, "excess_homo", "excess_hetero"))

a <- hwe_exceed%>%
  group_by(basin) %>%
  tally()

kable(a, caption = "Number of markers significantly out of HWE")

Only 1 marker (Oke_RDDFW16781_68)was out of HW proportions in a single population (Yaquina), with less observed heterozygosity than expected.

Heterozygosity Results Summary

No populations demonstrated signifance differences in the extent of genetic diversity as measured by He. Only a single marker in a single population did not follow Hardy-Weinberg proportions.

Differentiation

First let’s look at some dataset-wide basic Fstats

fstat <- genind2hierfstat(genind_2.0)
colnames(fstat) <- c(pop, names(genind_2.0$loc.n.all))

basicstats <- basic.stats(fstat)
kable(basicstats$overall, caption = "All markers Fstats")
All markers Fstats
x
Ho 0.3317
Hs 0.3202
Ht 0.3268
Dst 0.0066
Htp 0.3281
Dstp 0.0080
Fst 0.0203
Fstp 0.0242
Fis -0.0358
Dest 0.0117

Overall Fst is 0.020 and Fis is -0.04

Pairwise FST

Next let’s look at pairwise FST among basins

genet.dist(fstat, method="WC84")
##                  Coos     Nehalem     Netarts      Siletz   Tillamook
## Nehalem   0.019878392                                                
## Netarts   0.031610832 0.009869960                                    
## Siletz    0.032812760 0.017418342 0.036952394                        
## Tillamook 0.032519430 0.003442455 0.014195557 0.029883697            
## Yaquina   0.043404971 0.013729597 0.022842561 0.018033977 0.013462716

Moderate Fst among basin at the markers in the panel. Ranging from 0.003 to 0.043

What about within a basin. Let’s compare Miami to Kilchis samples

#check order for merge
#row.names(n.pop$Tillamook$tab)== filter(genos_2.0, basin == "Tillamook") %>% pull(sample) 
n.pop <- seppop(genind_2.0)
#add detailed_location as pop in genind
genind_till<- n.pop$Tillamook
genind_till@pop <- as.factor(filter(genos_2.0, basin == "Tillamook") %>% pull(detailed_location))
 
fstat_till <- genind2hierfstat(genind_till)
colnames(fstat_till) <- c(pop, names(genind_till$loc.n.all))
genet.dist(fstat_till, method="WC84")
##             Kilchis River
## Miami River   0.003610589
basic_till <- basic.stats(fstat_till)
basic_till$overall
##      Ho      Hs      Ht     Dst     Htp    Dstp     Fst    Fstp     Fis    Dest 
##  0.3433  0.3188  0.3194  0.0006  0.3199  0.0012  0.0018  0.0036 -0.0770  0.0017
#genind_yaq<- n.pop$Yaquina
#genind_yaq@pop <- as.factor(filter(genos_2.0, basin == "Yaquina") %>% pull(detailed_location))
 
#fstat_yaq <- genind2hierfstat(genind_yaq)
#colnames(fstat_yaq) <- c(pop, names(genind_yaq$loc.n.all))
#genet.dist(fstat_yaq, method="WC84")

#basic_till <- basic.stats(fstat_till)
#basic_till$overall

Very low differentiation at these markers (0.003 and 0).

site wise fst
for the spatial analysis it is probably preferable to split Kilchis and Miami, we’ll need an FST table across all sites, even better if it is in a clear order

#add detailed_location as pop in genind
genind_site<- genind_2.0
genind_site@pop <- as.factor(pull(genos_2.0, detailed_location))
 
fstat_site <- genind2hierfstat(genind_site)
colnames(fstat_site) <- c(pop, names(genind_site$loc.n.all))
gdist <- genet.dist(fstat_site, method="WC84")

gdist <- as.matrix(gdist)[c("Coos River", "Mill Creek", "Simpson Creek", "Bear Creek", "Whiskey Creek", "Kilchis River", "Miami River", "Nehalem River"), c("Coos River", "Mill Creek","Simpson Creek", "Bear Creek", "Whiskey Creek", "Kilchis River", "Miami River", "Nehalem River")]
gdist  <- as.dist(gdist)
gdist
##                  Coos River    Mill Creek Simpson Creek    Bear Creek
## Mill Creek     0.0374053458                                          
## Simpson Creek  0.0518401106 -0.0007962974                            
## Bear Creek     0.0325903075  0.0151952892  0.0292530648              
## Whiskey Creek  0.0405211671  0.0255668245  0.0253905076  0.0420544496
## Kilchis River  0.0297259520  0.0167876708  0.0230143056  0.0346796201
## Miami River    0.0318894937  0.0133700860  0.0174815862  0.0252004665
## Nehalem River  0.0266426446  0.0134105773  0.0167983616  0.0218513473
##               Whiskey Creek Kilchis River   Miami River
## Mill Creek                                             
## Simpson Creek                                          
## Bear Creek                                             
## Whiskey Creek                                          
## Kilchis River  0.0199631940                            
## Miami River    0.0166811287  0.0036105892              
## Nehalem River  0.0150288835  0.0034178897  0.0038004777

Marker level Fst

Now let’s look at the distribution of Fst among markers.

ggplot(basicstats$perloc)+geom_histogram(aes(x=Fst))+theme_classic()+ggtitle("among basin Fst")

ggplot(basic_till$perloc)+geom_histogram(aes(x=Fst))+theme_classic()+ggtitle("wihin Tillamook Fst")

#check if any non-run timing markers have high Fst
#max((basicstats$perloc[basicstats$perloc$run_timing=="non-run",])$Fst, na.rm = TRUE)

Yes, potentially some outliers here in the among basin comparisons, but without annotations yet, not much of interest we can say or do with this result. It would be good to get in touch with WDFW about how they chose the markers (reports/ms describing it all cite to something unpublished).

Spatial Analysis

Spatial Variables

We need a good set of spatial variables against which to examine the distribution of genetic variation. We will first generate a distance matrix based on alongshore distance, this use this distance matrix to generate Moran’s eigenvector maps for spatial eigenanalysis.

Distance Matrix

Rather than use line-of-sight for the distances among samples, we will attempt to estimate travel paths. We use mouth of the named tributary in the sample metadata. If no precise sampling location we use the mouth of the basin at the head of the bay.

#or_coast<- getNOAA.bathy(lon1 = -125, lon2 = -123, lat1 = 43, lat2 = 46, resolution = 1)
blues <- c("lightsteelblue4", "lightsteelblue3", "lightsteelblue2", "lightsteelblue1")
greys <- c(grey(0.6), grey(0.93), grey(0.99))

#downloaded a higher resolution bathymetry dataset from GEBCO (NOAA bathy had some sampling locations at >100m elevation)
or_coast <- readGEBCO.bathy("bathy/gebco_2020_n46.26621723175049_s42.488765716552734_w-124.53535079956053_e-122.70243644714354.nc")

trans<-trans.mat(or_coast, min.depth=17 , max.depth=-20)
sites <- distinct(genos_2.0, basin, detailed_location, lat, lon)
sites %<>%
  arrange(lat) %>%
  column_to_rownames(var = "detailed_location")

path <- lc.dist(trans, sites[,c(3,2)], res = "path")
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  96%
  |                                                                            
  |======================================================================| 100%
distance_matrix <- lc.dist(trans, sites[,c(3,2)], res = "dist")

#get.depth(or_coast, sites[,c(3,2)] , locator = FALSE)

#manually set the within Yaquina distance
distance_matrix[[8]] <- 11

distance_matrix
##               Coos River Mill Creek Simpson Creek Bear Creek Whiskey Creek
## Mill Creek           143                                                  
## Simpson Creek        143         11                                       
## Bear Creek           175         33            33                         
## Whiskey Creek        236         94            94         61              
## Kilchis River        253        110           110         78            18
## Miami River          253        111           111         78            19
## Nehalem River        271        128           128         96            37
##               Kilchis River Miami River
## Mill Creek                             
## Simpson Creek                          
## Bear Creek                             
## Whiskey Creek                          
## Kilchis River                          
## Miami River              10            
## Nehalem River            31          27
plot(or_coast, image = TRUE, land = TRUE, lwd = 0.03, bpal = list(c(0, max(or_coast), greys),c(min(or_coast), 0, blues)), xlim = c(-125, -123), ylim = c(43,46))
points(sites[,c(3,2)], pch = 20, col = "red", cex = 2.0 )
lapply(path, lines, col = "blue", lwd = 2, lty = 1)->dummy
Bathymetric map of sampling locations with minimum path between sites constrained by deep water (gold line - 20m)

Bathymetric map of sampling locations with minimum path between sites constrained by deep water (gold line - 20m)

#autoplot(or_coast, geom=c("r"))+ scale_fill_gradientn(values = scales::rescale(c(-3300, -100, 1, 2000)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+

#autoplot(atlantic2, geom=c("r")) + scale_fill_gradientn(values = scales::rescale(c(-6600, -10, 1, 2000)), colors = c("steelblue4", "#C7E0FF", "grey50", "grey80"))+geom_point(data= map_data, aes(lon, lat, color = mean_annual_max), size = 4)+scale_color_viridis(option = "plasma")+xlab("Longitude") +ylab("Latitude")+guides(fill=FALSE)+labs(color = "")+theme(legend.position = c(0.9, 0.2), legend.background = element_rect(fill = "transparent"))+annotate(geom = "text", x = -69, y = 31.2, label = "Mean Annual\nMaximum Temperature", angle= 90)

dbMEMs

Now let’s make some distance based MEMs from the distance matrix.

#make the distance matrix big

sites_big <- sites %>%
  rownames_to_column(var = "detailed_location") %>%
  right_join(dplyr::select(genos_2.0, detailed_location)) %>%
  mutate(lat_jitter <- jitter(lat, amount = 0.00001)) %>%
  mutate(lon_jitter <- jitter(lon, amount = 0.00001))
## Joining, by = "detailed_location"
distance_matrix_big <- lc.dist(trans, sites_big[,c(6,5)], res = "dist")


#dbmem <- dbmem(sites_big[,c(6,5)], MEM.autocor = "positive")
dbmem <- dbmem(distance_matrix_big, MEM.autocor = "all")
## Warning in dudi.pco(matdist, scannf = FALSE, nf = 2): Non euclidean distance
test <- moran.randtest(dbmem, nrepet = 999)
test
## class: krandtest lightkrandtest 
## Monte-Carlo tests
## Call: moran.randtest(x = dbmem, nrepet = 999)
## 
## Number of tests:   226 
## 
## Adjustment method for multiple comparisons:   none 
## Permutation number:   999 
##       Test           Obs     Std.Obs   Alter Pvalue
## 1     MEM1  2.259302e-02  51.0253268 greater  0.001
## 2     MEM2  1.198738e-02  44.0101200 greater  0.001
## 3     MEM3 -2.332452e-32  10.7603995 greater  0.001
## 4     MEM4 -3.386620e-33  10.4556632 greater  0.001
## 5     MEM5 -7.952695e-33  10.1370872 greater  0.001
## 6     MEM6 -1.285096e-32  10.7978630 greater  0.001
## 7     MEM7 -4.218639e-33   7.4072420 greater  0.006
## 8     MEM8 -3.612212e-32  10.3214330 greater  0.001
## 9     MEM9 -1.124023e-32   7.2043679 greater  0.001
## 10   MEM10 -6.064611e-33  10.4820936 greater  0.001
## 11   MEM11 -8.089773e-33  10.4100250 greater  0.001
## 12   MEM12 -8.776618e-32  11.6285612 greater  0.001
## 13   MEM13 -3.711985e-32  10.9971822 greater  0.001
## 14   MEM14 -5.716536e-33   9.2962660 greater  0.002
## 15   MEM15 -3.133331e-32  10.3901343 greater  0.001
## 16   MEM16 -1.929578e-32   7.3155525 greater  0.004
## 17   MEM17 -8.385185e-33  11.0419927 greater  0.001
## 18   MEM18 -7.961199e-32  10.5572181 greater  0.001
## 19   MEM19 -1.038598e-32  10.7643723 greater  0.001
## 20   MEM20 -9.324515e-33  10.4876805 greater  0.001
## 21   MEM21 -1.331230e-32  11.5179742 greater  0.001
## 22   MEM22 -9.011441e-33   9.9539058 greater  0.001
## 23   MEM23 -1.683635e-32  11.3478215 greater  0.001
## 24   MEM24 -8.038340e-33   8.8952031 greater  0.001
## 25   MEM25 -6.080108e-32  11.1636593 greater  0.001
## 26   MEM26 -2.252269e-32  10.4858950 greater  0.001
## 27   MEM27 -9.001426e-35   6.5985415 greater  0.012
## 28   MEM28 -3.035295e-33  11.7918714 greater  0.001
## 29   MEM29 -3.775837e-32  10.5357242 greater  0.001
## 30   MEM30 -1.497956e-31  11.8330958 greater  0.001
## 31   MEM31 -3.242558e-32  10.4422947 greater  0.001
## 32   MEM32 -6.307628e-33  10.8047418 greater  0.001
## 33   MEM33 -6.441469e-33  10.7047820 greater  0.001
## 34   MEM34 -1.334655e-32  10.6313352 greater  0.001
## 35   MEM35 -1.033136e-32  10.2432902 greater  0.001
## 36   MEM36 -9.028724e-33   9.2101062 greater  0.002
## 37   MEM37 -6.745484e-33  10.8573580 greater  0.001
## 38   MEM38 -7.403525e-33  10.8766845 greater  0.001
## 39   MEM39 -9.765357e-33  10.0414830 greater  0.001
## 40   MEM40 -1.271923e-32  11.2168940 greater  0.001
## 41   MEM41 -1.036164e-31   8.0797855 greater  0.007
## 42   MEM42 -1.449864e-32   9.6795331 greater  0.001
## 43   MEM43 -6.939815e-33  10.0573297 greater  0.001
## 44   MEM44 -3.864336e-32  10.2606630 greater  0.001
## 45   MEM45 -6.210094e-33   9.8419786 greater  0.001
## 46   MEM46 -8.309673e-33  10.1710899 greater  0.001
## 47   MEM47 -3.324814e-32  10.1809439 greater  0.001
## 48   MEM48 -3.930333e-32   9.7138434 greater  0.001
## 49   MEM49 -5.313351e-32   9.8855223 greater  0.001
## 50   MEM50 -9.456294e-33  11.0754937 greater  0.001
## 51   MEM51 -7.702160e-33   9.0754939 greater  0.001
## 52   MEM52 -1.003504e-32  10.8323560 greater  0.001
## 53   MEM53 -1.001643e-32  10.9415657 greater  0.001
## 54   MEM54 -4.362056e-03   0.1505112 greater  0.445
## 55   MEM55 -4.638660e-03  -0.2767679 greater  0.843
## 56   MEM56 -4.638660e-03  -0.3554596 greater  0.785
## 57   MEM57 -4.638660e-03  -0.3676613 greater  0.750
## 58   MEM58 -4.638660e-03  -0.4607852 greater  0.758
## 59   MEM59 -4.638660e-03  -0.4291300 greater  0.734
## 60   MEM60 -4.638660e-03  -0.4405975 greater  0.715
## 61   MEM61 -4.638660e-03  -0.4957272 greater  0.720
## 62   MEM62 -4.638660e-03  -0.4142401 greater  0.715
## 63   MEM63 -4.638660e-03  -0.4398420 greater  0.742
## 64   MEM64 -4.638660e-03  -0.5605418 greater  0.717
## 65   MEM65 -4.638660e-03  -0.5406192 greater  0.717
## 66   MEM66 -4.638660e-03  -0.5252224 greater  0.724
## 67   MEM67 -4.638660e-03  -0.5170971 greater  0.738
## 68   MEM68 -4.638660e-03  -0.4241012 greater  0.714
## 69   MEM69 -4.638660e-03  -0.4123586 greater  0.710
## 70   MEM70 -4.638660e-03  -0.5407038 greater  0.736
## 71   MEM71 -4.638660e-03  -0.4667298 greater  0.727
## 72   MEM72 -4.638660e-03  -0.5144093 greater  0.719
## 73   MEM73 -4.638660e-03  -0.5200527 greater  0.709
## 74   MEM74 -4.638660e-03  -0.6051728 greater  0.741
## 75   MEM75 -4.638660e-03  -0.5303088 greater  0.699
## 76   MEM76 -4.638660e-03  -0.6295848 greater  0.740
## 77   MEM77 -4.638660e-03  -0.5781889 greater  0.725
## 78   MEM78 -4.638660e-03  -0.3969220 greater  0.765
## 79   MEM79 -4.638660e-03  -0.5117518 greater  0.705
## 80   MEM80 -4.638660e-03  -0.4908546 greater  0.710
## 81   MEM81 -4.638660e-03  -0.4470374 greater  0.714
## 82   MEM82 -4.638660e-03  -0.5031459 greater  0.693
## 83   MEM83 -4.638660e-03  -0.5110826 greater  0.712
## 84   MEM84 -4.638660e-03  -0.6004968 greater  0.735
## 85   MEM85 -4.638660e-03  -0.5629896 greater  0.709
## 86   MEM86 -4.638660e-03  -0.4894438 greater  0.720
## 87   MEM87 -4.638660e-03  -0.4547146 greater  0.713
## 88   MEM88 -4.638660e-03  -0.4774758 greater  0.700
## 89   MEM89 -4.638660e-03  -0.5311684 greater  0.712
## 90   MEM90 -4.638660e-03  -0.5339532 greater  0.727
## 91   MEM91 -4.638660e-03  -0.5240116 greater  0.716
## 92   MEM92 -4.638660e-03  -0.5570631 greater  0.730
## 93   MEM93 -4.638660e-03  -0.4174593 greater  0.696
## 94   MEM94 -4.638660e-03  -0.5503549 greater  0.711
## 95   MEM95 -4.638660e-03  -0.5166426 greater  0.704
## 96   MEM96 -4.638660e-03  -0.5058899 greater  0.723
## 97   MEM97 -4.638660e-03  -0.5734451 greater  0.734
## 98   MEM98 -4.638660e-03  -0.4513986 greater  0.690
## 99   MEM99 -4.638660e-03  -0.5196424 greater  0.713
## 100 MEM100 -4.638660e-03  -0.4730278 greater  0.690
## 101 MEM101 -4.638660e-03  -0.4795323 greater  0.729
## 102 MEM102 -4.638660e-03  -0.4517684 greater  0.703
## 103 MEM103 -4.638660e-03  -0.5044942 greater  0.702
## 104 MEM104 -4.638660e-03  -0.5252695 greater  0.704
## 105 MEM105 -4.638660e-03  -0.5477151 greater  0.715
## 106 MEM106 -4.638660e-03  -0.5512867 greater  0.713
## 107 MEM107 -4.638660e-03  -0.5230924 greater  0.731
## 108 MEM108 -4.638660e-03  -0.4991599 greater  0.717
## 109 MEM109 -4.638660e-03  -0.5082520 greater  0.731
## 110 MEM110 -4.638660e-03  -0.5294486 greater  0.710
## 111 MEM111 -4.638660e-03  -0.5145499 greater  0.707
## 112 MEM112 -4.638660e-03  -0.5563976 greater  0.721
## 113 MEM113 -4.638660e-03  -0.5404053 greater  0.717
## 114 MEM114 -4.638660e-03  -0.5598164 greater  0.724
## 115 MEM115 -4.638660e-03  -0.5101859 greater  0.710
## 116 MEM116 -4.638660e-03  -0.5229712 greater  0.733
## 117 MEM117 -4.638660e-03  -0.5705661 greater  0.742
## 118 MEM118 -4.638660e-03  -0.5360472 greater  0.722
## 119 MEM119 -4.638660e-03  -0.4599668 greater  0.724
## 120 MEM120 -4.638660e-03  -0.4791597 greater  0.703
## 121 MEM121 -4.638660e-03  -0.5831944 greater  0.740
## 122 MEM122 -4.638660e-03  -0.5290682 greater  0.714
## 123 MEM123 -4.638660e-03  -0.5645660 greater  0.724
## 124 MEM124 -4.638660e-03  -0.5399788 greater  0.722
## 125 MEM125 -4.638660e-03  -0.5450267 greater  0.716
## 126 MEM126 -4.638660e-03  -0.5720650 greater  0.738
## 127 MEM127 -4.638660e-03  -0.5084557 greater  0.694
## 128 MEM128 -4.638660e-03  -0.5109740 greater  0.697
## 129 MEM129 -4.638660e-03  -0.4299371 greater  0.691
## 130 MEM130 -4.638660e-03  -0.5801220 greater  0.734
## 131 MEM131 -4.638660e-03  -0.5384878 greater  0.716
## 132 MEM132 -4.638660e-03  -0.4794197 greater  0.690
## 133 MEM133 -4.638660e-03  -0.5471429 greater  0.734
## 134 MEM134 -4.638660e-03  -0.5359550 greater  0.690
## 135 MEM135 -4.638660e-03  -0.5147043 greater  0.740
## 136 MEM136 -4.638660e-03  -0.4468834 greater  0.685
## 137 MEM137 -4.638660e-03  -0.5511291 greater  0.717
## 138 MEM138 -4.638660e-03  -0.5284930 greater  0.746
## 139 MEM139 -4.638660e-03  -0.5425211 greater  0.719
## 140 MEM140 -4.638660e-03  -0.4507217 greater  0.679
## 141 MEM141 -4.638660e-03  -0.4036453 greater  0.724
## 142 MEM142 -4.638660e-03  -0.5659070 greater  0.732
## 143 MEM143 -4.638660e-03  -0.5335484 greater  0.704
## 144 MEM144 -4.638660e-03  -0.5072321 greater  0.715
## 145 MEM145 -4.638660e-03  -0.4644341 greater  0.723
## 146 MEM146 -4.638660e-03  -0.5156770 greater  0.713
## 147 MEM147 -4.638660e-03  -0.4522387 greater  0.710
## 148 MEM148 -4.638660e-03  -0.5691498 greater  0.744
## 149 MEM149 -4.638660e-03  -0.5460539 greater  0.714
## 150 MEM150 -4.638660e-03  -0.4916194 greater  0.714
## 151 MEM151 -4.638660e-03  -0.5329015 greater  0.705
## 152 MEM152 -4.638660e-03  -0.5614534 greater  0.705
## 153 MEM153 -4.638660e-03  -0.4936998 greater  0.735
## 154 MEM154 -4.638660e-03  -0.5881094 greater  0.730
## 155 MEM155 -4.638660e-03  -0.5243625 greater  0.708
## 156 MEM156 -4.638660e-03  -0.5136245 greater  0.715
## 157 MEM157 -4.638660e-03  -0.5117964 greater  0.706
## 158 MEM158 -4.638660e-03  -0.4610917 greater  0.705
## 159 MEM159 -4.638660e-03  -0.5181523 greater  0.718
## 160 MEM160 -4.638660e-03  -0.5301228 greater  0.724
## 161 MEM161 -4.638660e-03  -0.5450203 greater  0.716
## 162 MEM162 -4.638660e-03  -0.4300954 greater  0.740
## 163 MEM163 -4.638660e-03  -0.5065372 greater  0.706
## 164 MEM164 -4.638660e-03  -0.4706821 greater  0.703
## 165 MEM165 -4.638660e-03  -0.4699634 greater  0.715
## 166 MEM166 -4.638660e-03  -0.3077797 greater  0.723
## 167 MEM167 -4.638660e-03  -0.4406705 greater  0.705
## 168 MEM168 -4.638660e-03  -0.5223274 greater  0.712
## 169 MEM169 -4.638660e-03  -0.4710238 greater  0.706
## 170 MEM170 -4.638660e-03  -0.2879066 greater  0.825
## 171 MEM171 -4.638660e-03  -0.4816580 greater  0.722
## 172 MEM172 -4.638660e-03  -0.5072163 greater  0.697
## 173 MEM173 -4.638660e-03  -0.4306310 greater  0.744
## 174 MEM174 -4.638660e-03  -0.4464374 greater  0.706
## 175 MEM175 -4.638660e-03  -0.3821307 greater  0.707
## 176 MEM176 -4.638660e-03  -0.5343492 greater  0.718
## 177 MEM177 -4.638660e-03  -0.4659285 greater  0.713
## 178 MEM178 -4.638660e-03  -0.2922082 greater  0.792
## 179 MEM179 -4.638660e-03  -0.4118791 greater  0.766
## 180 MEM180 -4.638660e-03  -0.4963737 greater  0.735
## 181 MEM181 -4.638660e-03  -0.5378953 greater  0.722
## 182 MEM182 -4.638660e-03  -0.4677347 greater  0.722
## 183 MEM183 -4.638660e-03  -0.3150404 greater  0.770
## 184 MEM184 -4.638660e-03  -0.4970078 greater  0.725
## 185 MEM185 -4.639073e-03  -0.3842474 greater  0.779
## 186 MEM186 -7.482221e-03  -8.0675011 greater  1.000
## 187 MEM187 -8.310192e-03  -9.7169572 greater  1.000
## 188 MEM188 -9.125331e-03 -11.4653451 greater  1.000
## 189 MEM189 -9.277320e-03 -10.0155616 greater  1.000
## 190 MEM190 -9.277320e-03  -9.8153449 greater  1.000
## 191 MEM191 -9.277320e-03 -10.2865240 greater  1.000
## 192 MEM192 -9.277320e-03 -11.7606683 greater  1.000
## 193 MEM193 -9.277320e-03 -12.3694408 greater  1.000
## 194 MEM194 -9.277320e-03 -11.8307093 greater  1.000
## 195 MEM195 -9.277320e-03 -10.6810845 greater  1.000
## 196 MEM196 -9.277320e-03 -11.7209110 greater  1.000
## 197 MEM197 -9.277320e-03 -10.9271116 greater  1.000
## 198 MEM198 -9.277320e-03 -11.4170281 greater  1.000
## 199 MEM199 -9.277320e-03 -11.5844398 greater  1.000
## 200 MEM200 -9.277320e-03 -11.3111900 greater  1.000
## 201 MEM201 -9.277320e-03 -11.0548909 greater  1.000
## 202 MEM202 -9.277320e-03 -12.1002113 greater  1.000
## 203 MEM203 -9.277320e-03 -11.6165036 greater  1.000
## 204 MEM204 -9.277320e-03 -11.0773157 greater  1.000
## 205 MEM205 -9.277320e-03 -11.6779234 greater  1.000
## 206 MEM206 -9.277320e-03  -9.9416817 greater  1.000
## 207 MEM207 -9.277320e-03 -11.4644978 greater  1.000
## 208 MEM208 -9.277320e-03 -12.6007301 greater  1.000
## 209 MEM209 -9.277320e-03 -11.8672756 greater  1.000
## 210 MEM210 -9.277320e-03  -9.2227755 greater  1.000
## 211 MEM211 -9.277320e-03 -11.0576080 greater  1.000
## 212 MEM212 -9.277320e-03 -10.8819596 greater  1.000
## 213 MEM213 -9.277320e-03  -7.0666440 greater  1.000
## 214 MEM214 -9.277320e-03 -11.8692201 greater  1.000
## 215 MEM215 -9.277320e-03  -9.8400719 greater  1.000
## 216 MEM216 -9.277320e-03 -12.7557903 greater  1.000
## 217 MEM217 -9.277320e-03  -9.9758861 greater  1.000
## 218 MEM218 -9.277798e-03  -9.8256749 greater  1.000
## 219 MEM219 -1.316238e-02 -21.3025195 greater  1.000
## 220 MEM220 -1.379955e-02 -13.8605407 greater  1.000
## 221 MEM221 -1.391258e-02 -18.8035537 greater  1.000
## 222 MEM222 -1.391598e-02 -20.3020757 greater  1.000
## 223 MEM223 -1.391598e-02 -19.1736192 greater  1.000
## 224 MEM224 -1.391598e-02 -21.3790561 greater  1.000
## 225 MEM225 -1.391601e-02 -18.2195500 greater  1.000
## 226 MEM226 -2.277720e-02 -35.8356862 greater  1.000
barplot(attr(dbmem, "values"), 
        main = "Eigenvalues of the spatial weighting matrix\nProportional to Moran's I\nFull Dataset", cex.main = 0.7)

dbmem <- dbmem(distance_matrix_big, MEM.autocor = "positive")
## Warning in dudi.pco(matdist, scannf = FALSE, nf = 2): Non euclidean distance
dbmem_sites <- as.data.frame(cbind(dbmem, sites_big))

dbmem_sites_summary <- dbmem_sites %>%
  group_by(detailed_location) %>%
  summarise(mem1 = mean(MEM1), mem2 = mean(MEM2))

map_data %<>%
  left_join(dbmem_sites_summary)
## Joining, by = "detailed_location"
map_data %<>%
  arrange(desc(lat))


a <- ggmap(map) + geom_point(data = map_data, aes(x = lon, y = lat, color = mem1), size = 2)+ geom_text(data = distinct(map_data, basin, .keep_all = TRUE), aes(label = basin ), hjust = 1.2)+scale_color_scico(palette = "batlow", name = "MEM1")+ggtitle("(a)")+xlab("")+ylab("")

b <- ggmap(map) + geom_point(data = map_data, aes(x = lon, y = lat, color = mem2), size = 2)+ geom_text(data = distinct(map_data, basin, .keep_all = TRUE), aes(label = basin ), hjust = 1.2)+scale_color_scico(palette = "batlow", name = "MEM2")+ggtitle("(b)")+xlab("")+ylab("")
cowplot::plot_grid(a,b)

all MEMs with positive moran’s I (e.g. autocorrelation) are significant. Keeping the first two. MEM1 largely captures the big distance between Coos Bay and all the other sampling sites. MEM2 is effectively distance away from Yaquina and captures spatial autocorrelation at a finer scale.

Mantel Test

Here we examine the correlation between spatial distance and linearized FST to test for IBD with a simple Mantel test

gdist_m <- as.matrix(gdist)
gdist_long <- data.frame(col=colnames(gdist_m)[col(gdist_m)], row=rownames(gdist_m)[row(gdist_m)], dist=c(gdist_m))
gdist_long %<>%
  filter(dist != 0) %>%
  rename(fst = dist) %>%
  mutate(lin_fst = fst/(1-fst))
  
gdist_linear <- as.dist(xtabs(gdist_long[, 4] ~ gdist_long[, 2] + gdist_long[, 1]))
gdist_linear
##                  Bear Creek    Coos River Kilchis River   Miami River
## Coos River     0.0336882168                                          
## Kilchis River  0.0359255029  0.0306366558                            
## Miami River    0.0258519476  0.0329399314  0.0036236728              
## Mill Creek     0.0154297487  0.0388588754  0.0170743087  0.0135512676
## Nehalem River  0.0223394953  0.0273719045  0.0034296117  0.0038149764
## Simpson Creek  0.0301345940  0.0546744396  0.0235564408  0.0177926296
## Whiskey Creek  0.0439006680  0.0422324763  0.0203698411  0.0169641092
##                  Mill Creek Nehalem River Simpson Creek
## Coos River                                             
## Kilchis River                                          
## Miami River                                            
## Mill Creek                                             
## Nehalem River  0.0135928655                            
## Simpson Creek -0.0007956638  0.0170853678              
## Whiskey Creek  0.0262376376  0.0152581971  0.0260519806
#reorder
gdist_linear <- as.matrix(gdist_linear)[c("Coos River", "Mill Creek","Simpson Creek", "Bear Creek", "Whiskey Creek", "Kilchis River", "Miami River", "Nehalem River"), c("Coos River", "Mill Creek", "Simpson Creek","Bear Creek", "Whiskey Creek", "Kilchis River", "Miami River", "Nehalem River")]
gdist_linear  <- as.dist(gdist_linear)

mt <- mantel(gdist_linear, distance_matrix) #999 permutation, pearson correlation
mt
## 
## Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel(xdis = gdist_linear, ydis = distance_matrix) 
## 
## Mantel statistic r: 0.5398 
##       Significance: 0.04 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.443 0.519 0.575 0.611 
## Permutation: free
## Number of permutations: 999
geo_dist_long <- data.frame(col=colnames(as.matrix(distance_matrix))[col(as.matrix(distance_matrix))], row=rownames(as.matrix(distance_matrix))[row(as.matrix(distance_matrix))], dist=c(as.matrix(distance_matrix)))
geo_dist_long %<>%
  filter(dist != 0)

dist_long <- left_join(gdist_long, geo_dist_long)
## Joining, by = c("col", "row")
ggplot(dist_long)+geom_point(aes(dist, lin_fst), size = 3, alpha = 0.7)+theme_classic()+xlab("Alongshore Distance (km)")+ylab(expression(paste("F"[ST],"/(1-F"[ST], ")")))

Mantel Test is marginally significant (p 0.044), and highly explanatory (mantel r statistic = 0.54) suggesting that genetic differentiation among sampling locations is in part driven by IBD.

RDA

snp_rda_null <- rda(X ~ 1, data = dbmem)
snp_rda_full <- rda(X ~ . , data = dbmem)


#check that the full model is significant
anova(snp_rda_full) # 0.002 yes it is significant - proceed to variable selection
#what the variance explained
adjR2.rda_full <- RsquareAdj(snp_rda_full)$adj.r.squared
adjR2.rda_full
## [1] 0.008841477
snp_rda_final <- rda(X ~ MEM1 + MEM2 , data = dbmem)


#variable selection
# here we do variable selection with three different forward selection approaches
ordiR2 <- ordiR2step(snp_rda_null, scope = formula(snp_rda_full, data = dbmem))
## Step: R2.adj= 0 
## Call: X ~ 1 
##  
##                 R2.adjusted
## <All variables> 0.008841477
## + MEM2          0.005942339
## + MEM1          0.002859842
## <none>          0.000000000
## 
##        Df    AIC     F Pr(>F)   
## + MEM2  1 1468.1 2.351  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.005942339 
## Call: X ~ MEM2 
##  
##                 R2.adjusted
## <All variables> 0.008841477
## + MEM1          0.008841477
## <none>          0.005942339
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 1468.4 1.6581  0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: R2.adj= 0.008841477 
## Call: X ~ MEM2 + MEM1 
## 
ordiR2$anova  
ordi <- ordistep(snp_rda_null, scope = formula(snp_rda_full, data = dbmem)) 
## 
## Start: X ~ 1 
## 
##        Df    AIC      F Pr(>F)   
## + MEM2  1 1468.1 2.3510  0.005 **
## + MEM1  1 1468.8 1.6482  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: X ~ MEM2 
## 
##        Df    AIC     F Pr(>F)   
## - MEM2  1 1468.5 2.351  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        Df    AIC      F Pr(>F)   
## + MEM1  1 1468.4 1.6581  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step: X ~ MEM2 + MEM1 
## 
##        Df    AIC      F Pr(>F)   
## - MEM1  1 1468.1 1.6581  0.005 **
## - MEM2  1 1468.8 2.3579  0.005 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ordi$anova
vif.cca(snp_rda_full)
## MEM1 MEM2 
##    1    1

The global model is significant (p < .001) with an adjusted R2 of 0.010. Both variable selection procedures suggest retaining MEM1 and MEM2

#testing signficance of constrained axes (how many are interesting)
rda_axis <- anova.cca(snp_rda_final, by = 'axis', parallel = 3)
rda_axis$p.adj <- p.adjust (rda_axis$`Pr(>F)`, method = 'fdr')
rda_axis
#testing significance of explanatory variables by margin (each conditioned on all the others)
rda_expl <-  anova.cca(snp_rda_final, by = 'margin', parallel = 3)
rda_expl$p.adj <- p.adjust (rda_expl$`Pr(>F)`, method = 'holm')
rda_expl$pve<- rda_expl$Variance/sum(rda_expl$Variance)

rda_expl
## tri plot
#site score
rda_sum<-summary(snp_rda_final, axes = c(2))
scores <- as.data.frame(rda_sum$sites)
scores$pop<-genind_2.0$pop


#species scores
arrows<-as.data.frame(rda_sum$biplot)

scores %<>%
  mutate(pop = fct_relevel(pop, "Nehalem", "Tillamook", "Netarts", "Siletz", "Yaquina", "Coos"))

#plot by pop
#ggplot()+geom_point(aes(RDA1, RDA2, color = pop, shape = pop), alpha = 0.7, size = 3, data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="red")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.4, y=RDA2+RDA2*0.4, label=c("MEM1", "MEM2")), size = 4,  color="red")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.0% of Total Variance")+ylab("Redundant Axis 2\n0.7% of Total Variance")+scale_color_scico_d(name = "Basin")+theme(legend.text = element_text(size = 12))+scale_shape_manual(name = "Basin", values = c(6,9,18,16,17,15))

ggplot()+geom_point(aes(RDA1, RDA2, color = pop), alpha = 0.7, size = 2, data = scores)+geom_segment(data=arrows, aes(x=0, y=0, xend=RDA1, yend=RDA2), arrow=arrow(length=unit(0.2,"cm")), color="red")+geom_text(data=arrows, aes(x=RDA1+RDA1*0.4, y=RDA2+RDA2*0.4, label=c("MEM1", "MEM2")), size = 4,  color="red")+stat_ellipse(aes(RDA1, RDA2, color = pop), data = scores)+theme_classic()+xlab("Redundant Axis 1\n1.0% of Total Variance")+ylab("Redundant Axis 2\n0.7% of Total Variance")+scale_color_manual(name = "Basin", values = c("#4477AA", "#66CCEE", "#228833", "#CCBB44", "#EE6677", "#AA3377" ))+theme(legend.text = element_text(size = 12))
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning: Removed 2 row(s) containing missing values (geom_path).

#variable loadings
snp_loading <- as.data.frame(scores(snp_rda_final, display = "species"))


 ggplot(data = snp_loading)+geom_histogram(aes(x =scale(RDA1)))+theme_classic()+ggtitle(paste("Full Pacific RDA 1\n",sum(abs(scale(snp_loading$RDA1)) > 3)/2, sep = "" ))+xlab("Z-scores of SNP loadings")+geom_vline(aes(xintercept = 3), color = "red")+geom_vline(aes(xintercept = -3), color = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#count the number of outliers 
sum(abs(scale(snp_loading$RDA1)) > 3)/2
## [1] 0
#which(scale(snp_loading$RDA1) > 3)
#[1] 275 364
#> snp_loading[275,]
#                         RDA1         PC1
#Oke_RDDFW29994_56.A 0.2364937 -0.06317717
# > snp_loading[364,]
#                         RDA1       PC1
# Oke_RDDFW61351_74.G 0.305528 -0.232033

Both RDA axes are significant and explain 1.0% and 0.7 of total variance respectively. Collectively, the two RDAs constrained ~1.76% of variance, given that the global FST in the dataset was 0.0203, this means we were able to explain 87% of among population variation as spatial autocorrelation among samples. The primary axis of constrained variation (RDA1) was driven by MEM2 which largely captures distance away from Yaquina. The second axis was driven mostly by mem1 and separates Coos Bay samples from all others.

STRUCTURE

Here we use a bayesian, model based clustering algorithm (STRUCTURE) to infer population structure and estimate admixture proportions of individual samples.

First we need to get our dataset ready for structure: remove linked loci, convert to structure format.

# first lets calculate LD (dartR has a great (fast) ld estimator that works right on genind files, so let's use this)
ldreport <- dartR::gl.report.ld(dartR::gi2gl(genind_2.0, verbose = 0), name = NULL, save = FALSE, nchunks = 2, ncores = 3, chunkname = NULL, verbose = 0)
##   Processing a SNP dataset
## Start conversion....
## Please note conversion of bigger data sets will take some time!
## Once finished, we recommend to save the object using >save(object, file="object.rdata")

We’ll prune (keep one) the dataset of any locus-pairs with r2 > 0.2, then convert to STRUCTURE format

unlinked_genind <- genind_2.0[loc=-unique(ldreport[ldreport$R2>0.2,]$loc2)]
rm(ldreport)
#note just sort of crashed through this with a text editor, not easily logged, but the general idea was transpose the data, split columns (diploid to dual haploid) then convert data to integers
df <- genind2df(unlinked_genind)
df <- as.data.frame(t(df))
write_tsv(df, "genotype_data/all.str.tmp")
#do stuff here
# Coos 1
# Yaquina 2
# Nehalem 3
# Netarts 4
# Siletz 5
# Tillamook 6

df <- read_tsv("genotype_data/all.str.tmp", col_names = FALSE)
df <- t(df)
write_tsv(as.data.frame(df), "genotype_data/all.str", col_names = FALSE)

Removed 7 high LD loci

Run Log

Structure was run in a GUI outside this computation notebook’s environment.
admixture model: admixture, with correlated allele frequency
burnin/mcmc: ran with k=1-6 for 100k iteration to check for convergence used 20,000/40,000 for final runs
replicates: did 10 replicates for k=1-6

Best K was chosen by the evanno method, and estimated in structure harvester.
Replicate results within each K were clumpp’d using the clumpak algorithm on the clumpak webserver

Results

Here we visualize the structure results of the clumpp’d results of all K values

Best K

Best K was 2 according to the Evanno (delta K) method, however, it’s important to remember the bias toward k=2 when differentiation is low or there is no population structure using this method. Delta k literally cannot evaluate K=1. (see note below)

note: delta K suggests best k is two, however, particularly at low differentiation, the delta k method is biased towards k=2 (cullingham 2020), seems like a good place to remind myself that k is a model that doesn’t always fully catpure biological reality and comparing results across different levels of k can proide interesting insights, even when best k is unknown, particularly in the case of low differentiation.

Structure Plots

Next let’s take the clumppd results and make some publication-ready figures.

#import clump results into dataframes
# results are in files k*/majorcluster/clumppfiles/clumpindoutput
# took these files and captured the relevant data with a text editor (original input files are a mess with multiple field separators) and saved to new files
k2 <- read_tsv("./structure/formatted_results/k2.txt")
k3 <- read_tsv("./structure/formatted_results/k3.txt")
k4 <- read_tsv("./structure/formatted_results/k4.txt")
k5 <- read_tsv("./structure/formatted_results/k5.txt")
k6 <- read_tsv("./structure/formatted_results/k6.txt")

levels_pop <- c("Nehalem", "Tillamook", "Netarts", "Siletz", "Yaquina", "Coos")

k2 %<>%
  mutate(pop =  factor(pop, levels = levels_pop)) %>%
  arrange(pop)  
k3 %<>%
  mutate(pop =  factor(pop, levels = levels_pop)) %>%
  arrange(pop) 
k4 %<>%
  mutate(pop =  factor(pop, levels = levels_pop)) %>%
  arrange(pop) 
k5 %<>%
  mutate(pop =  factor(pop, levels = levels_pop)) %>%
  arrange(pop) 
k6 %<>%
  mutate(pop =  factor(pop, levels = levels_pop)) %>%
  arrange(pop) 

plot_data <- k2 %>% 
  rownames_to_column(var="id") %>% 
 # sample the half_pounder and fall fish to smaller size for plot
  gather('cluster', 'prob', clust1:clust2) %>%
  group_by(id) %>% 
  mutate(likely_assignment = cluster[which.max(prob)]) %>%
  mutate(
         assingment_prob = max(prob)) %>% 
  arrange(likely_assignment, desc(assingment_prob)) %>% 
  ungroup()


a <- ggplot(plot_data, aes(id, prob, fill = cluster)) +
  geom_col(width=1.0) +
  facet_grid(~pop, scales = 'free', space = 'free', switch = "x") +
  scale_y_continuous(expand = c(0, 0), labels = NULL) +
  scale_x_discrete(expand = expand_scale(add = 1)) +
  theme(panel.spacing=unit(0.1, "lines"), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), legend.position = "none", axis.title.y=element_blank(),axis.ticks.y=element_blank(), strip.background = element_rect(color = "white", fill = "white"), strip.text.x = element_blank())+
  scale_fill_manual(values = c("#4477AA", "#AA3377" ))

plot_data <- k3 %>% 
  rownames_to_column(var="id") %>% 
  gather('cluster', 'prob', clust1:clust3) %>%
  group_by(id) %>% 
  mutate(likely_assignment = cluster[which.max(prob)],
         assingment_prob = max(prob)) %>% 
  arrange(likely_assignment, desc(assingment_prob)) %>% 
  ungroup()

b <- ggplot(plot_data, aes(id, prob, fill = cluster)) +
  geom_col(width=1.0) +
  facet_grid(~pop, scales = 'free', space = 'free', switch = "x") +
  scale_y_continuous(expand = c(0, 0), labels = NULL) +
  scale_x_discrete(expand = expand_scale(add = 1)) +
  theme(panel.spacing=unit(0.1, "lines"), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), legend.position = "none", axis.title.y=element_blank(),axis.ticks.y=element_blank(), strip.background = element_rect(color = "white", fill = "white"), strip.text.x = element_blank()) +
  scale_fill_manual(values = c("#4477AA",  "#CCBB44", "#AA3377" ))

plot_data <- k4 %>% 
  rownames_to_column(var="id") %>% 
  gather('cluster', 'prob', clust1:clust4) %>%
  group_by(id) %>% 
  mutate(likely_assignment = cluster[which.max(prob)],
         assingment_prob = max(prob)) %>% 
  arrange(likely_assignment, desc(assingment_prob)) %>% 
  ungroup()

c <- ggplot(plot_data, aes(id, prob, fill = cluster)) +
  geom_col(width=1.0) +
  facet_grid(~pop, scales = 'free', space = 'free', switch = "x") +
  scale_y_continuous(expand = c(0, 0), labels = NULL) +
  scale_x_discrete(expand = expand_scale(add = 1)) +
  theme(panel.spacing=unit(0.1, "lines"), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), legend.position = "none", axis.title.y=element_blank(), axis.ticks.y=element_blank(),strip.background = element_rect(color = "white", fill = "white"), strip.text.x = element_blank()) +
  scale_fill_manual(values = c("#4477AA",  "#228833", "#CCBB44", "#AA3377" ))

plot_data <- k5 %>% 
  rownames_to_column(var="id") %>% 
  gather('cluster', 'prob', clust1:clust5) %>%
  group_by(id) %>% 
  mutate(likely_assignment = cluster[which.max(prob)],
         assingment_prob = max(prob)) %>% 
  arrange(likely_assignment, desc(assingment_prob)) %>% 
  ungroup()

d <- ggplot(plot_data, aes(id, prob, fill = cluster)) +
  geom_col(width=1.0) +
  facet_grid(~pop, scales = 'free', space = 'free', switch = "x") +
  scale_y_continuous(expand = c(0, 0), labels = NULL) +
  scale_x_discrete(expand = expand_scale(add = 1)) +
  theme(panel.spacing=unit(0.1, "lines"), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(), legend.position = "none", axis.title.y=element_blank(),axis.ticks.y=element_blank(), strip.background = element_rect(color = "white", fill = "white"), strip.text.x = element_blank())+scale_fill_manual(values = c("#4477AA",  "#228833", "#CCBB44", "#EE6677", "#AA3377" ))

plot_data <- k6 %>% 
  rownames_to_column(var="id") %>% 
  gather('cluster', 'prob', clust1:clust6) %>%
  group_by(id) %>% 
  mutate(likely_assignment = cluster[which.max(prob)],
         assingment_prob = max(prob)) %>% 
  arrange(likely_assignment, desc(assingment_prob)) %>% 
  ungroup()

e <- ggplot(plot_data, aes(id, prob, fill = cluster)) +
  geom_col(width=1.0) +
  facet_grid(~pop, scales = 'free', space = 'free', switch = "x") +
  scale_y_continuous(expand = c(0, 0), labels = NULL) +
  scale_x_discrete(expand = expand_scale(add = 1)) +
  theme(panel.spacing=unit(0.1, "lines"), axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),axis.ticks.y=element_blank(), legend.position = "none", axis.title.y=element_blank(), strip.background = element_rect(color = "white", fill = "white"), strip.text.x = element_text(angle = 90)) +
  scale_fill_manual(values = c("#4477AA", "#66CCEE", "#228833", "#CCBB44", "#EE6677", "#AA3377" ))

cowplot::plot_grid(a,b,c,d,e, rel_heights = c(1,1,1,1,2.0) ,ncol=1)

GSI

Do we have enough data and is there sufficient differentiation to assign individuals back to their basin of origin? Let’s run rubias to check.

NOTE: Turned this off in the final notebook rendering. Decided to keep it out of report.

set up dataset

# we need to make some changes to the  dataset (nucleotide to integer, once column per allele etc)

df <- genind2df(unlinked_genind)
df %<>%
  rownames_to_column("indiv")
write_tsv(df, "genotype_data/unlinked_raw.txt")

#write_tsv(rogue_baseline, "rogue_baseline.txt")
#made changes using regex in a text editor
# first split columns of genotype data
# then duplicated headers: find: ([a-zA-Z0-9\-_]+), replace: \1\t\1_1

Reference Self Assignment

baseline <- read_tsv("GSI/baseline.txt")
baseline %<>%
  mutate(across(everything(), as.character)) %>%
  mutate(collection = repunit) %>%
  relocate(collection , .after = repunit)

#actually, no, let's put the colelction in there correctly
baseline %<>%
  left_join(dplyr::select(genos_2.0, sample, detailed_location), by = c("indiv" = "sample")) %>%
  mutate(collection = detailed_location) %>%
  dplyr::select(-c(detailed_location))

sa <- self_assign(reference = baseline, gen_start_col = 5)

#summarise by reporting unit
sa_to_repu <- sa %>%
  group_by(indiv, collection, repunit, inferred_repunit) %>%
  summarise(repu_scaled_like = sum(scaled_likelihood))

# for each individual, assign to most likely reporting unit
sa_assign <- sa_to_repu %>%
  group_by(indiv) %>%
  slice_max(repu_scaled_like)

sa_assign$correct_assignment <- sa_assign$repunit == sa_assign$inferred_repunit

sum(sa_assign$correct_assignment/nrow(sa_assign))

sa_assign %>%
  group_by(repunit) %>%
  summarise(correct_assignment_rate = sum(correct_assignment ==TRUE)/n(), sample_size = n())

LOO Self-assignment assigns (maximum scaled likelihood) reference individuals back to correct reporting unit (basin) 71% of time, however there was a high degree of variance among basins, with the assignment rate highly dependent on sample size.

Simulated Mixtures

Let’s do a similar analysis on a simulated mixture.

Here we conduct a 500 simulations of a mixture of 200 samples drawn at equal rates from the reporting units.

ref_sims_no_prior <- assess_reference_loo(reference = baseline, 
                     gen_start_col = 5, 
                     reps = 500, 
                     mixsize = 200,
                     )

tmp <- ref_sims_no_prior %>%
  group_by(iter, repunit) %>%
  summarise(true_repprop = sum(true_pi), 
            reprop_posterior_mean = sum(post_mean_pi),
            repu_n = sum(n)) %>%
  mutate(repu_n_prop = repu_n / sum(repu_n))

ggplot(tmp, aes(x = true_repprop, y = reprop_posterior_mean, colour = repunit)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  facet_wrap(~ repunit)+scale_color_scico_d()

Using equal priors acorss the two reporting units the true vs inferred mixture proprotions (above) show higly variable residuals in estimation of mixing proportion. We tend to assume more fish in the mixture are from systems with high samples sizes and underassign to those with low

Now let’s check how reliable an individual assignment (posterior probability) is from these simulations.

ref_sims_no_prior_indivs <- assess_reference_loo(reference = baseline, 
                     gen_start_col = 5, 
                     reps = 500, 
                     mixsize = 200,
                     return_indiv_posteriors = TRUE)

# summarise things
repu_pofzs <- ref_sims_no_prior_indivs$indiv_posteriors %>%
  filter(repunit == simulated_repunit) %>%
  group_by(iter, indiv, simulated_collection, repunit) %>%  # first aggregate over reporting units
  summarise(repu_PofZ = sum(PofZ)) %>%
  ungroup() %>%
  arrange(repunit, simulated_collection) %>%
  mutate(simulated_collection = factor(simulated_collection, levels = unique(simulated_collection)))
#> `summarise()` regrouping output by 'iter', 'indiv', 'simulated_collection' (override with `.groups` argument)

# also get the number of simulated individuals from each collection
num_simmed <- ref_sims_no_prior_indivs$indiv_posteriors %>%
  group_by(iter, indiv) %>%
  slice(1) %>%
  ungroup() %>%
  count(simulated_collection)
  
# note, the last few steps make simulated collection a factor so that collections within
# the same repunit are grouped together in the plot.

# now, plot it
ggplot(repu_pofzs, aes(x = simulated_collection, y = repu_PofZ)) +
  geom_boxplot(aes(colour = repunit)) +
  geom_text(data = num_simmed, mapping = aes(y = 1.025, label = n), angle = 90, hjust = 0, vjust = 0.5, size = 3) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9, vjust = 0.5)) +
  ylim(c(0, 1.25))+scale_color_scico_d()+theme_classic()

This follows the result above. Very low assignment accuracy at the individual level at all but Tillamook and Yaquina.

Sub-basin assignment

Is fst = 0.002 sufficient for GSI within a basin? Let’s check

coll_pofzs <- ref_sims_no_prior_indivs$indiv_posteriors %>%
  filter(collection == simulated_collection) %>%
  arrange(repunit, simulated_collection) %>%
  mutate(simulated_collection = factor(simulated_collection, levels = unique(simulated_collection)))
#> `summarise()` regrouping output by 'iter', 'indiv', 'simulated_collection' (override with `.groups` argument)

# also get the number of simulated individuals from each collection
num_simmed <- ref_sims_no_prior_indivs$indiv_posteriors %>%
  group_by(iter, indiv) %>%
  slice(1) %>%
  ungroup() %>%
  count(simulated_collection)
  
# note, the last few steps make simulated collection a factor so that collections within
# the same repunit are grouped together in the plot.

# now, plot it
ggplot(coll_pofzs, aes(x = simulated_collection, y = PofZ)) +
  geom_boxplot(aes(colour = simulated_collection)) +
  geom_text(data = num_simmed, mapping = aes(y = 1.025, label = n), angle = 90, hjust = 0, vjust = 0.5, size = 3) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9, vjust = 0.5)) +
  ylim(c(0, 1.25))+scale_color_scico_d()

No, pretty poor assignment to Kilchis and Miami.

Tissue Sampling Issues

Panel optimization GT failure

In the panel optimization run, several samples failed to genotype. These were included in this panel as well to determine if the failure to genotype was due to sample quality or other issues.

# import data from test plates
test_run_genos <- read_csv("previous_run_genotype_data/Oke_GTs_0.1.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   `Raw Reads` = col_double(),
##   `On-Target Reads` = col_double(),
##   `%On-Target` = col_double(),
##   `%GT` = col_double(),
##   IFI = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 95 parsing failures.
## row col    expected      actual                                         file
##   1  -- 356 columns 357 columns 'previous_run_genotype_data/Oke_GTs_0.1.csv'
##   2  -- 356 columns 357 columns 'previous_run_genotype_data/Oke_GTs_0.1.csv'
##   3  -- 356 columns 357 columns 'previous_run_genotype_data/Oke_GTs_0.1.csv'
##   4  -- 356 columns 357 columns 'previous_run_genotype_data/Oke_GTs_0.1.csv'
##   5  -- 356 columns 357 columns 'previous_run_genotype_data/Oke_GTs_0.1.csv'
## ... ... ........... ........... ............................................
## See problems(...) for more details.
# import unfiltered genotype data
raw_genos <- read_csv("genotype_data/Oke_2021_coastal_genos_0.1.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   `Raw Reads` = col_double(),
##   `On-Target Reads` = col_double(),
##   `%On-Target` = col_double(),
##   `%GT` = col_double(),
##   IFI = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 383 parsing failures.
## row col    expected      actual                                           file
##   1  -- 356 columns 357 columns 'genotype_data/Oke_2021_coastal_genos_0.1.csv'
##   2  -- 356 columns 357 columns 'genotype_data/Oke_2021_coastal_genos_0.1.csv'
##   3  -- 356 columns 357 columns 'genotype_data/Oke_2021_coastal_genos_0.1.csv'
##   4  -- 356 columns 357 columns 'genotype_data/Oke_2021_coastal_genos_0.1.csv'
##   5  -- 356 columns 357 columns 'genotype_data/Oke_2021_coastal_genos_0.1.csv'
## ... ... ........... ........... ..............................................
## See problems(...) for more details.
raw_genos %<>%
  mutate(Sample = str_extract(Sample, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")) %>%
  dplyr::select(Sample, `Raw Reads`, `On-Target Reads`, `%On-Target`, `%GT`, IFI)

shared_genos <- raw_genos %>%
  inner_join(dplyr::select(test_run_genos,Sample, `Raw Reads`, `On-Target Reads`, `%On-Target`, `%GT`, IFI), by = c("Sample" = "Sample"))

ggplot(data = shared_genos)+geom_point(aes(`%GT.x`, `%GT.y`))+geom_abline(aes(slope = 1, intercept = 0))+theme_classic()+xlab("this run %GT")+ylab("previous run %GT")

ggplot(data = shared_genos)+geom_point(aes(`%On-Target.x`, `%On-Target.y`))+geom_abline(aes(slope = 1, intercept = 0))+theme_classic()+xlab("this run %On-Target")+ylab("previous run %On-Target")

Most samples improved when run a second time. Need to check if they were rextracted or purified on this run, or just multiplexed into the library a second time

Tissue Type

Some samples have metadata of alternative tissue sampling, let’s compare these to see if any worked better than others

#clean up the metadata
# a lot of the tissue type data is not useful, different names used for the likely the same tissue type, multiple tissue types in one sample, tried to clean up best I could: all muscle grouped together, mulitple tissue flagged, unclear markers as unknown
# tissue is fin unless otherwise noted

#prep the data
tissue_data <- read_tsv("metadata/tissue_type.txt")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   sample = col_character(),
##   tissue_type_raw = col_character(),
##   tissue_type = col_character()
## )
tissue_data %<>%
  mutate(tissue_type = case_when(is.na(tissue_type) ~ "fin",
                                TRUE ~ tissue_type))

raw_genos %<>%
  left_join(tissue_data, by = c("Sample" = "sample")) %>%
  mutate(tissue_type = case_when(str_detect(Sample, "OkeCC13") ~ "scale",
                                 TRUE ~ tissue_type))

raw_genos %>%
  group_by(tissue_type) %>%
  summarise(mean = mean(`%On-Target`), n = n()) %>%
  arrange(desc(n))
raw_genos %>%
  group_by(tissue_type) %>%
  summarise(mean = mean(`On-Target Reads`), n = n()) %>%
  arrange(desc(n))
#only plot major tissue sample type(NA, multiple, and n < 5 excluded, also exclude archival samples (scales))
ggplot(data = filter(raw_genos, tissue_type %in% c("fin", "operculum punch", "muscle")))+geom_boxplot(aes(tissue_type, `On-Target Reads`))+theme_bw()

ggplot(data = filter(raw_genos, tissue_type %in% c("fin", "operculum punch", "muscle")))+geom_boxplot(aes(tissue_type, `%On-Target`))+theme_bw()

summary(aov(data = filter(raw_genos, tissue_type %in% c("fin", "operculum punch", "muscle")), `On-Target Reads` ~ tissue_type ))
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## tissue_type   2 2.077e+11 1.038e+11   1.068  0.345
## Residuals   347 3.375e+13 9.725e+10
summary(aov(data = filter(raw_genos, tissue_type %in% c("fin", "operculum punch", "muscle")), `%On-Target` ~ tissue_type ))
##              Df Sum Sq Mean Sq F value Pr(>F)
## tissue_type   2    684   342.2   1.643  0.195
## Residuals   347  72263   208.3

No significant differences between major tissue sample types. Fin performed best, but the mean %On Target Reads was only a 13% improvement over operculum punch and 23% improvement over muscle. Given that muscle was reported to be hard to work with during extractions, perhaps fins and operculum punches should be prioritized.

Archival Samples

Could we successfuly extract DNA and genotype samples from archival (2013) scales? Let’s compare the genotyping success between major fresh tissue type and archived scales.

raw2 <- filter(raw_genos, tissue_type %in% c("fin", "operculum punch", "muscle", "scale"))

raw2 %<>%
  mutate(fresh_scale = case_when(tissue_type %in% c("fin", "operculum punch", "muscle") ~ "fresh",
                                 tissue_type == "scale" ~ "scale"))

kable(raw2 %>%
        group_by(fresh_scale) %>%
  summarise(mean_on_target = mean(`On-Target Reads`), mean_on_target_perc = mean(`%On-Target`), mean_GT_perc = mean(`%GT`)))
fresh_scale mean_on_target mean_on_target_perc mean_GT_perc
fresh 261139.6 18.60226 85.6968
scale 142650.7 10.64500 74.7710
summary(aov(data =raw2, `%On-Target` ~ fresh_scale ))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## fresh_scale   1    616   615.6    2.96 0.0862 .
## Residuals   358  74458   208.0                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(data =raw2, `On-Target Reads` ~ fresh_scale ))
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## fresh_scale   1 1.365e+11 1.365e+11   1.427  0.233
## Residuals   358 3.425e+13 9.568e+10
kable(raw2 %>%
 group_by(tissue_type) %>%
 summarise(n = n(), n_retained = sum(`%GT`> 80)/n()))
tissue_type n n_retained
fin 234 0.8974359
muscle 45 0.7111111
operculum punch 71 0.7605634
scale 10 0.8000000
a <- ggplot(data = raw2)+geom_density(aes(x = `On-Target Reads`, color = tissue_type, fill = tissue_type), alpha =0.5) + scale_fill_scico_d()+scale_color_scico_d()+theme_classic()+theme(legend.title = element_blank())

b <- ggplot(data = raw2)+geom_density(aes(x = `%On-Target`/100, color = tissue_type, fill = tissue_type), alpha = 0.5)+scale_fill_scico_d()+scale_color_scico_d()+theme_classic()+theme(legend.title = element_blank())+xlab("Proportion On-Target")

c <- ggplot(data = raw2)+geom_density(aes(x = 1-(`%GT`/100), color = tissue_type, fill = tissue_type), alpha = 0.5)+scale_fill_scico_d()+scale_color_scico_d()+theme_classic()+theme(legend.title = element_blank())+xlab("Proportion Missing Data")

cowplot::plot_grid(a,b,c, ncol = 1)

Only half of the scale samples in the metadata were actually multiplexed into the library (ask Cristin and Sandra why? ws it low DNA quality? if so that affects the interpretation of these results). Answer here: only extracted 10 of the scales, wanted to save the samples if extractions didn’t work.

Scale samples have roughly 1/2 the number and proportion of on target reads, but similar rates of average missing data. There was not a significant differnence in the proportion of on-target reads. There was an interesting pattern of variance in the genotyping success of scale vs fin samples. While both tissue types had similar proportions of samples that completely failed genotypin (>20% missing data, about 20% of samples from each), there was much higher variance among scale samples in missing data rate among samples with <20% missing data.

Together these results suggest that GTseq study using scale samples is feasible, but we should be cautious about depths. Scale samples will require approximately 2x depth to reach a similar number of on-target reads as fresh samples.

extras

cohort comparisons

is there a difference between 2020 and 2013 yaquina samples?

genind_pop <- seppop(genind_full)

genind_pop$Yaquina@pop <- as.factor(case_when( str_detect(rownames(genind_pop$Yaquina$tab ), "OkeCC13") ~ "scale",
str_detect(rownames(genind_pop$Yaquina$tab ), "OkeCC19") ~ "fin",                                                   ))

#dapc
dapc_yaq <- dapc(genind_pop$Yaquina, n.pca = 8, n.da = 1 ) #limit to less than sample size for n.pcs
optim.a.score(dapc_yaq, smart = FALSE)

## $pop.score
## $pop.score$`1`
##          fin        scale 
##  0.001470588 -0.012500000 
## 
## $pop.score$`2`
##          fin        scale 
##  0.001470588 -0.012500000 
## 
## $pop.score$`3`
##         fin       scale 
## 0.001470588 0.000000000 
## 
## $pop.score$`4`
##          fin        scale 
##  0.002941176 -0.037500000 
## 
## $pop.score$`5`
##          fin        scale 
##  0.002941176 -0.062500000 
## 
## $pop.score$`6`
##          fin        scale 
##  0.005882353 -0.075000000 
## 
## $pop.score$`7`
##          fin        scale 
##  0.004411765 -0.062500000 
## 
## $pop.score$`8`
##         fin       scale 
##  0.01029412 -0.10000000 
## 
## 
## $mean
##             1             2             3             4             5 
## -0.0055147059 -0.0055147059  0.0007352941 -0.0172794118 -0.0297794118 
##             6             7             8 
## -0.0345588235 -0.0290441176 -0.0448529412 
## 
## $best
## 3 
## 3
dapc_yaq <- dapc(genind_pop$Yaquina, n.pca = 1, n.da = 1 )
scatter.dapc(dapc_yaq)

#nope no difference detectable even by DAPC
Xyaq <- scaleGen(genind_pop$Yaquina, NA.method="mean")
## Warning in .local(x, ...): Some scaling values are null.
##  Corresponding alleles are removed.
#lets do a pca just in case
#then run pca
pcayaq <- dudi.pca(Xyaq, scale = FALSE, scannf = FALSE, nf = 227)

#kept all PCs
snp_pcs_yaq <- pcayaq$li#[,c(1:kg)]

#now plot data


snp_pcs_yaq %<>%
  rownames_to_column("sample") %>%
  left_join(as.data.frame(cbind(rownames(genind_pop$Yaquina$tab), as.character(genind_pop$Yaquina$pop))), by = c("sample" = "V1")) 

ggplot(data = snp_pcs_yaq)+geom_point(aes(Axis1, Axis2, color = V2)) + stat_ellipse(aes(Axis1, Axis2, color = V2)) +theme_classic()+scale_color_scico_d()

#lets plot a pca just in case